
clear
set more off

*********************************************************
* Data cleaning and identification of duplicate         *
* observations                                          *
*********************************************************

* Load the raw May CPS extract (path is built from the global macro ${raw}).
use "${raw}/May_matching/CPSMay7782.dta", clear

* x7 becomes statecensus; rename it BEFORE moving the remaining x* variables
* to the end of the dataset so that statecensus keeps its position.
rename x7 statecensus
order x*, last

* Keep every record whose cluster string is not "00000".
keep if cluster != "00000"

* Household id: convert to numeric, then rename hhid/mis to hrhhid/mish
* (the names used as merge keys further down).
destring hhid, replace
rename hhid hrhhid
rename mis mish

* Recode sex from 0/1 to 1/2 and race from 1/2/3 to 100/200/700.
* NOTE(review): presumably this aligns with the coding in the using file
* merged later -- confirm against that file.
recode sex (0 = 1) (1 = 2)
recode race (1 = 100) (2 = 200) (3 = 700)

* Industry / occupation: copy ind70 and occ70, storing 0 where the source
* value is system-missing (.).
generate ind = cond(ind70 == ., 0, ind70)
generate occ = cond(occ70 == ., 0, occ70)

* Relationship code derived from _relhd. The four conditions are mutually
* exclusive, so the order of assignment does not matter.
* Stata treats missing as larger than any number, so records with a missing
* _relhd satisfy _relhd > 4 and receive 1260.
generate relate = 1260 if _relhd > 4
replace relate = 1001 if _relhd == 4
replace relate = 201 if _relhd == 3
replace relate = 101 if _relhd <= 2

drop region serial

* Remove records that are identical on every variable.
duplicates drop _all, force // 6 obs dropped
count // Sample 691,217

* Record the current observation order BEFORE any command that sorts the data.
* It is used below as an explicit tie-breaker so that the numbering of
* observably duplicate records is reproducible from run to run.
tempvar obs_order
generate long `obs_order' = _n

* dups1 = number of OTHER records sharing the same values of all match keys
* (0 means the record is unique on the keys).
duplicates tag hrhhid year statecensus mish age race sex ind occ relate grdatn, generate(dups1)
count if dups1 > 0 // 708 observably duplicate observations

* matchvar numbers the records inside each key group 1, 2, ... so that the
* keys plus matchvar can serve as a merge key.
* The original used `bys id: g matchvar = _n`, which numbers tied records in
* whatever order the sort leaves them; Stata breaks sort ties randomly, so the
* assignment among duplicates could change between runs. Sorting on the saved
* observation order within id makes the assignment deterministic. The set of
* matchvar values per group is unchanged.
* NOTE(review): egen group() without the missing option returns a missing id
* when any key is missing, so all such records fall into one group here --
* confirm that the key variables are never missing.
egen id = group(hrhhid year statecensus mish age race sex ind occ relate grdatn)
bysort id (`obs_order'): generate matchvar = _n
drop id `obs_order'

*********************************************************
* Merge    
*********************************************************

* Merge on the key variables plus the within-group counter matchvar.
* The using file path is built from the global macro ${wd}.
merge 1:m hrhhid year statecensus mish age race sex ind occ relate grdatn matchvar ///
	using "${wd}/May_matching/ipums_match_data.dta"

* Retain matched records only (_merge is 1, 2 or 3 after merge, so dropping
* everything that is not 3 keeps exactly the matches).
drop if _merge != 3 // 159 not matched

* Cross-tabulate the duplicate flags.
* NOTE(review): dups is not created in this file; presumably it comes from
* the using data -- confirm.
tabulate dups dups1

* The merge bookkeeping variables are no longer needed.
drop _merge matchvar

count // Sample 691,058
save "${wd}/May_matching/may_ipums_match_final.dta", replace
	


	

